## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
##
## Loading required package: plyr
## Loading required package: stringr
Part of the idea is that for the Laplace noising to work we have to plug in a sigma (level of noising). We simulate having a very good methodology to do so by supplying dCal a large calibration set to pick sigma. In practice you don’t have such a set and would need to either know sigma from first principles or experience, or use some of your training data to build it. What we want to demonstrate is the effectiveness of the differential privacy inspired Laplace nosing technique, so we will give it a good sigma (which one may or may not have in actual practice).
print('naive effects model')
## [1] "naive effects model"
bCoder <- trainEffectCoderR(dTrain,yName,vars,0)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.36368 -0.38461 0.00547 0.37974 1.76628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.08056 0.01285 6.270 4.44e-10 ***
## x1 0.26619 0.02110 12.617 < 2e-16 ***
## x2 0.25160 0.03405 7.389 2.18e-13 ***
## x3 0.19060 0.05277 3.612 0.000312 ***
## x4 0.27750 0.02483 11.178 < 2e-16 ***
## x5 0.27266 0.02451 11.122 < 2e-16 ***
## x6 0.21281 0.03761 5.658 1.75e-08 ***
## x7 0.25934 0.02207 11.752 < 2e-16 ***
## x8 0.28974 0.02194 13.208 < 2e-16 ***
## x9 0.24599 0.02020 12.175 < 2e-16 ***
## x10 0.26001 0.01797 14.465 < 2e-16 ***
## n1 0.09067 0.01400 6.475 1.19e-10 ***
## n2 0.09275 0.01416 6.552 7.26e-11 ***
## n3 0.11523 0.01392 8.277 2.31e-16 ***
## n4 0.08853 0.01403 6.308 3.48e-10 ***
## n5 0.07794 0.01441 5.410 7.09e-08 ***
## n6 0.09353 0.01446 6.468 1.25e-10 ***
## n7 0.08617 0.01443 5.974 2.74e-09 ***
## n8 0.10136 0.01409 7.196 8.80e-13 ***
## n9 0.09799 0.01410 6.949 5.00e-12 ***
## n10 0.10318 0.01430 7.214 7.70e-13 ***
## n11 0.11272 0.01402 8.041 1.53e-15 ***
## n12 0.08253 0.01430 5.773 9.04e-09 ***
## n13 0.10629 0.01451 7.324 3.51e-13 ***
## n14 0.10574 0.01409 7.502 9.46e-14 ***
## n15 0.11166 0.01427 7.827 8.13e-15 ***
## n16 0.08986 0.01379 6.518 9.01e-11 ***
## n17 0.08851 0.01406 6.295 3.78e-10 ***
## n18 0.09905 0.01404 7.056 2.36e-12 ***
## n19 0.09338 0.01476 6.328 3.07e-10 ***
## n20 0.11530 0.01387 8.311 < 2e-16 ***
## n21 0.09641 0.01424 6.769 1.71e-11 ***
## n22 0.10090 0.01397 7.225 7.15e-13 ***
## n23 0.08629 0.01374 6.282 4.10e-10 ***
## n24 0.10409 0.01408 7.392 2.12e-13 ***
## n25 0.08755 0.01444 6.063 1.59e-09 ***
## n26 0.09146 0.01401 6.528 8.46e-11 ***
## n27 0.10086 0.01433 7.038 2.69e-12 ***
## n28 0.10048 0.01373 7.316 3.72e-13 ***
## n29 0.09177 0.01455 6.305 3.55e-10 ***
## n30 0.08877 0.01417 6.265 4.57e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5746 on 1959 degrees of freedom
## Multiple R-squared: 0.927, Adjusted R-squared: 0.9255
## F-statistic: 622.2 on 40 and 1959 DF, p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 0.568682746837468"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
'naive effects model train',
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: Removed 5 rows containing missing values (geom_path).
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.136]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.80270755169949"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
'naive effects model test',
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.285]
# try re-fitting pred to see if it is just a shift/scale issue
# it is unlikely to be such, but good to look
f2 <- paste(yName,'pred',sep=' ~ ')
m2 <- lm(f2,data=dTestB) # dileberately fitting on test itself (to show it does not help)
print(summary(m2))
##
## Call:
## lm(formula = f2, data = dTestB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6308 -1.2125 -0.0242 1.1666 6.6364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08973 0.01789 -5.016 5.37e-07 ***
## pred 1.35606 0.02195 61.781 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.779 on 9998 degrees of freedom
## Multiple R-squared: 0.2763, Adjusted R-squared: 0.2762
## F-statistic: 3817 on 1 and 9998 DF, p-value: < 2.2e-16
dTestB$pred2 <- predict(m2,newdata=dTestB)
print(paste('adjusted test rmse',rmse(dTestB$pred2,dTestB[[yName]])))
## [1] "adjusted test rmse 1.77849696252789"
print(WVPlots::ScatterHist(dTestB,'pred2',yName,
'naive effects model test (adjusted on test)',
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.434]
print(paste('effects model, sigma=',bSigmaBest))
## [1] "effects model, sigma= 12"
bCoder <- trainEffectCoderR(dTrain,yName,vars,bSigmaBest)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3576 -0.6985 -0.0127 0.6712 4.0484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.074222 0.023319 3.183 0.001481 **
## x1 0.790093 0.035577 22.208 < 2e-16 ***
## x2 0.847127 0.061765 13.715 < 2e-16 ***
## x3 0.658072 0.084838 7.757 1.39e-14 ***
## x4 0.788243 0.037567 20.982 < 2e-16 ***
## x5 0.902215 0.043730 20.631 < 2e-16 ***
## x6 0.797638 0.065822 12.118 < 2e-16 ***
## x7 0.868730 0.036699 23.672 < 2e-16 ***
## x8 0.936754 0.034456 27.187 < 2e-16 ***
## x9 0.934770 0.034868 26.809 < 2e-16 ***
## x10 0.860513 0.029248 29.421 < 2e-16 ***
## n1 0.005342 0.005571 0.959 0.337721
## n2 0.012332 0.006107 2.019 0.043576 *
## n3 0.023078 0.006372 3.622 0.000300 ***
## n4 0.012304 0.005980 2.058 0.039770 *
## n5 0.012269 0.006383 1.922 0.054740 .
## n6 0.011682 0.006432 1.816 0.069511 .
## n7 0.013200 0.006100 2.164 0.030603 *
## n8 0.017830 0.005981 2.981 0.002906 **
## n9 0.008691 0.005759 1.509 0.131450
## n10 0.008325 0.006495 1.282 0.200033
## n11 0.011927 0.006221 1.917 0.055357 .
## n12 0.006315 0.006958 0.907 0.364259
## n13 0.010086 0.005543 1.820 0.068959 .
## n14 0.018009 0.006281 2.867 0.004183 **
## n15 0.018781 0.006842 2.745 0.006108 **
## n16 0.026104 0.005460 4.781 1.87e-06 ***
## n17 0.023657 0.006369 3.715 0.000209 ***
## n18 0.020600 0.005833 3.532 0.000422 ***
## n19 0.021373 0.006597 3.240 0.001216 **
## n20 0.006881 0.006214 1.107 0.268342
## n21 0.021935 0.005898 3.719 0.000206 ***
## n22 0.018873 0.006429 2.936 0.003369 **
## n23 0.015765 0.005858 2.691 0.007177 **
## n24 0.018168 0.006600 2.753 0.005967 **
## n25 0.009150 0.005816 1.573 0.115808
## n26 0.017000 0.006432 2.643 0.008286 **
## n27 0.023230 0.006220 3.735 0.000193 ***
## n28 0.020311 0.007037 2.887 0.003938 **
## n29 0.012984 0.006928 1.874 0.061045 .
## n30 0.012488 0.005909 2.113 0.034693 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.028 on 1959 degrees of freedom
## Multiple R-squared: 0.7664, Adjusted R-squared: 0.7616
## F-statistic: 160.6 on 40 and 1959 DF, p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 1.01759599994699"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
paste('effects model train, sigma=',bSigmaBest),
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.619]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.19384430300736"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
paste('effects model test, sigma=',bSigmaBest),
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.768]
print('effects model, jacknifed')
## [1] "effects model, jacknifed"
bCoder <- trainEffectCoderR(dTrain,yName,vars,0)
# dTrainB <- bCoder$codeFrame(dTrain)
# dTrainB <- bCoder$codeFrame(dCal)
dTrainB <- jackknifeEffectCodeR(dTrain,yName,vars)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0503 -0.7460 0.0110 0.6973 3.9378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.080580 0.024499 3.289 0.00102 **
## x1 0.878376 0.036480 24.078 < 2e-16 ***
## x2 0.891187 0.062232 14.320 < 2e-16 ***
## x3 0.769654 0.098820 7.788 1.09e-14 ***
## x4 0.944314 0.043679 21.619 < 2e-16 ***
## x5 0.945152 0.042897 22.033 < 2e-16 ***
## x6 0.849552 0.069355 12.249 < 2e-16 ***
## x7 0.949746 0.037648 25.227 < 2e-16 ***
## x8 1.043654 0.036410 28.664 < 2e-16 ***
## x9 0.962349 0.033361 28.847 < 2e-16 ***
## x10 0.925969 0.029086 31.836 < 2e-16 ***
## n1 0.027136 0.020443 1.327 0.18454
## n2 -0.009259 0.020976 -0.441 0.65896
## n3 0.031421 0.020508 1.532 0.12565
## n4 -0.030794 0.020589 -1.496 0.13491
## n5 -0.020427 0.020843 -0.980 0.32720
## n6 0.009093 0.021514 0.423 0.67261
## n7 0.013300 0.021150 0.629 0.52953
## n8 0.008724 0.020401 0.428 0.66897
## n9 -0.001165 0.020370 -0.057 0.95440
## n10 -0.001888 0.021453 -0.088 0.92990
## n11 0.012744 0.020469 0.623 0.53364
## n12 -0.001472 0.020607 -0.071 0.94306
## n13 -0.034807 0.020765 -1.676 0.09385 .
## n14 0.019618 0.020409 0.961 0.33655
## n15 0.027026 0.020765 1.302 0.19323
## n16 -0.002004 0.019604 -0.102 0.91859
## n17 0.003144 0.020437 0.154 0.87777
## n18 0.008286 0.020588 0.402 0.68738
## n19 -0.040983 0.021189 -1.934 0.05324 .
## n20 -0.012028 0.020187 -0.596 0.55136
## n21 -0.024144 0.021062 -1.146 0.25179
## n22 0.016472 0.020233 0.814 0.41567
## n23 -0.012645 0.020121 -0.628 0.52976
## n24 0.020160 0.021181 0.952 0.34132
## n25 -0.049179 0.020551 -2.393 0.01680 *
## n26 0.021604 0.020720 1.043 0.29723
## n27 -0.022748 0.021124 -1.077 0.28166
## n28 -0.008041 0.020318 -0.396 0.69231
## n29 -0.036017 0.021344 -1.687 0.09167 .
## n30 0.009637 0.020581 0.468 0.63968
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.095 on 1959 degrees of freedom
## Multiple R-squared: 0.7349, Adjusted R-squared: 0.7295
## F-statistic: 135.8 on 40 and 1959 DF, p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 1.08397286675349"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
'effects model train, jackknifed',
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.941]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.0755272623194"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
'effects model test, jackknifed',
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1090]
print("vtreat split model")
## [1] "vtreat split model"
pruneSig = 0.05
print("working vtreat split model")
## [1] "working vtreat split model"
mTitle <- 'vtreat split model'
isTrain <- runif(nrow(dTrain))<=0.5
dTrainDT <- dTrain[isTrain,]
dTrainDC <- dTrain[!isTrain,]
treatments <- vtreat::designTreatmentsN(dTrainDC,vars,yName,
rareSig=0.3,
smFactor=5.0,
minFraction=2.0,
verbose=FALSE,
parallelCluster=cl)
dTrainV <- vtreat::prepare(treatments,dTrainDT,pruneSig=pruneSig,
parallelCluster=cl)
#print(treatments$scoreFrame)
varsV <- intersect(colnames(dTrainV),
treatments$scoreFrame$varName[treatments$scoreFrame$sig<pruneSig])
print(varsV)
## [1] "x1_catN" "x4_catN" "x5_catN" "x7_catN" "x8_catN" "x9_catN"
## [7] "x10_catN"
dTestV <- vtreat::prepare(treatments,dTest,pruneSig=pruneSig,
varRestriction=varsV,
parallelCluster=cl)
formulaV <- paste(yName,paste(varsV,collapse=' + '),sep=' ~ ')
modelV <- lm(formulaV,data=dTrainV)
dTrainV$pred <- predict(modelV,newdata=dTrainV)
print(paste('train rmse',rmse(dTrainV$pred,dTrainV[[yName]])))
## [1] "train rmse 1.26412700778996"
print(WVPlots::ScatterHist(dTrainV,'pred',yName,
paste(mTitle,'train'),
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1263]
dTestV$pred <- predict(modelV,newdata=dTestV)
print(paste('test rmse',rmse(dTestV$pred,dTestV[[yName]])))
## [1] "test rmse 1.246068093896"
print(WVPlots::ScatterHist(dTestV,'pred',yName,
paste(mTitle,'test'),
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1412]
print("vtreat cross model")
## [1] "vtreat cross model"
pruneSig = 0.05
if("mkCrossFrameNExperiment" %in% ls('package:vtreat')) {
print("working vtreat cross model")
mTitle <- 'vtreat cross model'
crossD <- vtreat::mkCrossFrameNExperiment(dTrain,vars,yName,
rareSig=0.3,
smFactor=5.0,
minFraction=2.0,
parallelCluster=cl)
treatments <- crossD$treatments
dTrainV <- crossD$crossFrame
# print(treatments$scoreFrame)
varsV <- intersect(colnames(dTrainV),
treatments$scoreFrame$varName[treatments$scoreFrame$sig<pruneSig])
print(varsV)
dTestV <- vtreat::prepare(treatments,dTest,pruneSig=pruneSig,
varRestriction=varsV,
parallelCluster=cl)
formulaV <- paste(yName,paste(varsV,collapse=' + '),sep=' ~ ')
modelV <- lm(formulaV,data=dTrainV)
print(summary(modelV))
dTrainV$pred <- predict(modelV,newdata=dTrainV)
print(paste('train rmse',rmse(dTrainV$pred,dTrainV[[yName]])))
print(WVPlots::ScatterHist(dTrainV,'pred',yName,
paste(mTitle,'train'),
smoothmethod='lm',annot_size=2))
dTestV$pred <- predict(modelV,newdata=dTestV)
print(paste('test rmse',rmse(dTestV$pred,dTestV[[yName]])))
print(WVPlots::ScatterHist(dTestV,'pred',yName,
paste(mTitle,'test'),
smoothmethod='lm',annot_size=2))
} else {
print("cross model not in library")
}
## [1] "cross model not in library"
sSigma = 100
print(paste('effects model, smoothing=',sSigma))
## [1] "effects model, smoothing= 100"
bCoder <- trainEffectCoderLSmooth(dTrain,yName,vars,sSigma)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0123 -0.4071 0.0243 0.4038 2.0010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03157 0.01411 2.238 0.0253 *
## x1 0.46900 0.03433 13.661 < 2e-16 ***
## x2 0.43635 0.05605 7.785 1.12e-14 ***
## x3 0.35879 0.08629 4.158 3.35e-05 ***
## x4 0.48251 0.04054 11.902 < 2e-16 ***
## x5 0.49399 0.03994 12.369 < 2e-16 ***
## x6 0.36647 0.06119 5.989 2.50e-09 ***
## x7 0.45372 0.03566 12.725 < 2e-16 ***
## x8 0.51743 0.03583 14.443 < 2e-16 ***
## x9 0.44220 0.03236 13.667 < 2e-16 ***
## x10 0.46038 0.02866 16.066 < 2e-16 ***
## n1 2.50847 0.35889 6.990 3.77e-12 ***
## n2 2.48761 0.36780 6.764 1.77e-11 ***
## n3 3.15687 0.35059 9.004 < 2e-16 ***
## n4 2.56514 0.36398 7.048 2.51e-12 ***
## n5 1.88568 0.36896 5.111 3.52e-07 ***
## n6 2.19744 0.37914 5.796 7.90e-09 ***
## n7 2.08384 0.35166 5.926 3.66e-09 ***
## n8 2.36660 0.34844 6.792 1.46e-11 ***
## n9 2.46809 0.35716 6.910 6.51e-12 ***
## n10 2.20245 0.35444 6.214 6.30e-10 ***
## n11 2.68048 0.34446 7.782 1.15e-14 ***
## n12 1.99004 0.37429 5.317 1.18e-07 ***
## n13 2.61391 0.37424 6.984 3.90e-12 ***
## n14 2.43482 0.35298 6.898 7.09e-12 ***
## n15 2.57561 0.36112 7.132 1.38e-12 ***
## n16 2.43798 0.34197 7.129 1.41e-12 ***
## n17 2.25436 0.34132 6.605 5.11e-11 ***
## n18 2.52469 0.35400 7.132 1.39e-12 ***
## n19 2.50887 0.37768 6.643 3.97e-11 ***
## n20 2.24130 0.34401 6.515 9.21e-11 ***
## n21 2.25363 0.36182 6.229 5.75e-10 ***
## n22 2.29114 0.33994 6.740 2.08e-11 ***
## n23 1.94294 0.33639 5.776 8.88e-09 ***
## n24 2.42563 0.35719 6.791 1.47e-11 ***
## n25 1.98204 0.35585 5.570 2.90e-08 ***
## n26 2.16032 0.33883 6.376 2.26e-10 ***
## n27 2.35022 0.37638 6.244 5.21e-10 ***
## n28 2.07056 0.34846 5.942 3.32e-09 ***
## n29 2.03861 0.35614 5.724 1.20e-08 ***
## n30 2.27701 0.34654 6.571 6.40e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6266 on 1959 degrees of freedom
## Multiple R-squared: 0.9132, Adjusted R-squared: 0.9115
## F-statistic: 515.5 on 40 and 1959 DF, p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 0.620113653647223"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
paste('effects model train, smoothing=',sSigma),
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1585]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.6938494493661"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
paste('effects model test, smoothing=',sSigma),
smoothmethod='lm',annot_size=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1734]
if(!is.null(cl)) {
parallel::stopCluster(cl)
cl <- NULL
}